In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from pandas.core.dtypes.common import is_string_dtype, is_numeric_dtype
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42)
label_encoder = LabelEncoder()
scaler=StandardScaler()
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
#¶

PART-A¶

#¶

1. Data Understanding & Exploration¶

A. Read ‘TelcomCustomer-Churn_1.csv’ as a DataFrame and assign it to a variable.¶

In [2]:
telecom_cust_churn1=pd.read_csv('TelcomCustomer-Churn_1.csv')
telecom_cust_churn1.head()
Out[2]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No
In [3]:
telecom_cust_churn1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   customerID       7043 non-null   object
 1   gender           7043 non-null   object
 2   SeniorCitizen    7043 non-null   int64 
 3   Partner          7043 non-null   object
 4   Dependents       7043 non-null   object
 5   tenure           7043 non-null   int64 
 6   PhoneService     7043 non-null   object
 7   MultipleLines    7043 non-null   object
 8   InternetService  7043 non-null   object
 9   OnlineSecurity   7043 non-null   object
dtypes: int64(2), object(8)
memory usage: 550.4+ KB

TelcomCustomer-Churn_1 has 7043 data entries and 10 columns.

B. Read ‘TelcomCustomer-Churn_2.csv’ as a DataFrame and assign it to a variable.¶

In [4]:
telecom_cust_churn2=pd.read_csv('TelcomCustomer-Churn_2.csv')
telecom_cust_churn2.head()
Out[4]:
customerID OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE No Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
In [5]:
telecom_cust_churn2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   OnlineBackup      7043 non-null   object 
 2   DeviceProtection  7043 non-null   object 
 3   TechSupport       7043 non-null   object 
 4   StreamingTV       7043 non-null   object 
 5   StreamingMovies   7043 non-null   object 
 6   Contract          7043 non-null   object 
 7   PaperlessBilling  7043 non-null   object 
 8   PaymentMethod     7043 non-null   object 
 9   MonthlyCharges    7043 non-null   float64
 10  TotalCharges      7043 non-null   object 
 11  Churn             7043 non-null   object 
dtypes: float64(1), object(11)
memory usage: 660.4+ KB

TelcomCustomer-Churn_2 has 7043 data entries and 12 columns.

C. Merge both the DataFrames on key ‘customerID’ to form a single DataFrame.¶

In [6]:
telecom_cust_churn=pd.merge(telecom_cust_churn1, telecom_cust_churn2, on='customerID')
telecom_cust_churn.head()
Out[6]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

In [7]:
telecom_cust_churn.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.2+ MB

Merged dataset contains 7043 entries and 21 columns.

D. Verify if all the columns are incorporated in the merged DataFrame by using simple comparison Operator in Python.¶

In [8]:
columns1=set(telecom_cust_churn1.columns.tolist().__add__(telecom_cust_churn2.columns.tolist()))
columns2=set(telecom_cust_churn.columns.tolist())
'Identical columns' if columns1==columns2 else 'Non-Identical columns'
Out[8]:
'Identical columns'
In [9]:
print("Columns of unmerged Dataset:-", ", ".join(sorted(set(telecom_cust_churn1.columns.tolist().__add__(telecom_cust_churn2.columns.tolist())))))
print()
print("Columns of merged Dataset:-", ", ".join(sorted(telecom_cust_churn.columns.tolist())))
Columns of unmerged Dataset:- Churn, Contract, Dependents, DeviceProtection, InternetService, MonthlyCharges, MultipleLines, OnlineBackup, OnlineSecurity, PaperlessBilling, Partner, PaymentMethod, PhoneService, SeniorCitizen, StreamingMovies, StreamingTV, TechSupport, TotalCharges, customerID, gender, tenure

Columns of merged Dataset:- Churn, Contract, Dependents, DeviceProtection, InternetService, MonthlyCharges, MultipleLines, OnlineBackup, OnlineSecurity, PaperlessBilling, Partner, PaymentMethod, PhoneService, SeniorCitizen, StreamingMovies, StreamingTV, TechSupport, TotalCharges, customerID, gender, tenure

2. Data Cleaning & Analysis¶

A. Impute missing/unexpected values in the DataFrame.¶

B. Make sure all the variables with continuous values are of ‘Float’ type.¶

In [10]:
telecom_cust_churn.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.2+ MB
In [11]:
telecom_cust_churn[telecom_cust_churn.duplicated()]
Out[11]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn

0 rows × 21 columns

No duplicates found in the dataset.

In [12]:
telecom_cust_churn.isnull().sum()
Out[12]:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

No empty data is found in the dataset.

In [13]:
columns=telecom_cust_churn.columns.drop(['customerID','tenure','MonthlyCharges','TotalCharges'])
for col in columns:
    print('ColumnName:',col)
    print(telecom_cust_churn[col].value_counts())
    print(telecom_cust_churn[col].size==7043)
    print()
ColumnName: gender
Male      3555
Female    3488
Name: gender, dtype: int64
True

ColumnName: SeniorCitizen
0    5901
1    1142
Name: SeniorCitizen, dtype: int64
True

ColumnName: Partner
No     3641
Yes    3402
Name: Partner, dtype: int64
True

ColumnName: Dependents
No     4933
Yes    2110
Name: Dependents, dtype: int64
True

ColumnName: PhoneService
Yes    6361
No      682
Name: PhoneService, dtype: int64
True

ColumnName: MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: MultipleLines, dtype: int64
True

ColumnName: InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: InternetService, dtype: int64
True

ColumnName: OnlineSecurity
No                     3498
Yes                    2019
No internet service    1526
Name: OnlineSecurity, dtype: int64
True

ColumnName: OnlineBackup
No                     3088
Yes                    2429
No internet service    1526
Name: OnlineBackup, dtype: int64
True

ColumnName: DeviceProtection
No                     3095
Yes                    2422
No internet service    1526
Name: DeviceProtection, dtype: int64
True

ColumnName: TechSupport
No                     3473
Yes                    2044
No internet service    1526
Name: TechSupport, dtype: int64
True

ColumnName: StreamingTV
No                     2810
Yes                    2707
No internet service    1526
Name: StreamingTV, dtype: int64
True

ColumnName: StreamingMovies
No                     2785
Yes                    2732
No internet service    1526
Name: StreamingMovies, dtype: int64
True

ColumnName: Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64
True

ColumnName: PaperlessBilling
Yes    4171
No     2872
Name: PaperlessBilling, dtype: int64
True

ColumnName: PaymentMethod
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: PaymentMethod, dtype: int64
True

ColumnName: Churn
No     5174
Yes    1869
Name: Churn, dtype: int64
True

As checked in dataset, TotalCharges needed to have float as dtype, need to analyse the data.

In [14]:
telecom_cust_churn.TotalCharges.sort_values()
Out[14]:
936           
3826          
4380          
753           
5218          
         ...  
6646    997.75
5598     998.1
3686    999.45
3353     999.8
2845     999.9
Name: TotalCharges, Length: 7043, dtype: object

As TotalCharges contains empty values, which need to be imputed with some central tendency value.

In [15]:
telecom_cust_churn.TotalCharges=telecom_cust_churn.TotalCharges.replace(' ', np.nan)
telecom_cust_churn.TotalCharges=pd.to_numeric(telecom_cust_churn.TotalCharges, errors='coerce')
telecom_cust_churn.TotalCharges.fillna(telecom_cust_churn.TotalCharges.mean(),inplace = True)
telecom_cust_churn.TotalCharges.isnull().sum()
Out[15]:
0

Imputed missing values with mean for TotalCharges columns.

In [16]:
telecom_cust_churn.dtypes
Out[16]:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

MonthlyCharges and TotalCharges have dtype as float64.

C. Create a function that will accept a DataFrame as input and return pie-charts for all the appropriate Categorical features. Clearly show percentage distribution in the pie-chart.¶

D. Share insights for Q2.c.¶

In [17]:
for col in telecom_cust_churn.select_dtypes(include='object').columns.drop('customerID'):
    fig=px.pie(telecom_cust_churn, hole=0.3, values=telecom_cust_churn[col].value_counts(), names=telecom_cust_churn[col].value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title=col+' Column Representation')
    fig.show()
fig=px.pie(telecom_cust_churn, hole=0.3, values=telecom_cust_churn.SeniorCitizen.value_counts(), names=telecom_cust_churn.SeniorCitizen.value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title='SeniorCitizen Column Representation')
fig.show();

Pie Chart Graphical Representation description:

  1. Gender column contains Gender as Male(3555) and Gender as Female(3488) distribution.
  2. SeniorCitizen column contains SeniorCitizen(1142) and not a SeniorCitizen(5901) distribution.
  3. Partner column contains Partner as YES(3402) and Partner as NO(3641) distribution.
  4. Dependents column contains Dependents as YES(2110) and Dependents as NO(4933) distribution.
  5. PhoneService column contains PhoneService as YES(6361) and PhoneService as NO(682) distribution.
  6. MultipleLines column contains MultipleLines as YES(2971), MultipleLines as NO(3390) and MultipleLines as No phone service(682) distribution.
  7. InternetService column contains InternetService as DSL(2421), InternetService as NO(1526) and InternetService as Fiber optic(3096) distribution.
  8. OnlineSecurity column contains OnlineSecurity as YES(2019), OnlineSecurity as NO(3498) and OnlineSecurity as No internet service(1526) distribution.
  9. OnlineBackup column contains OnlineBackup as YES(2429), OnlineBackup as NO(3498) and OnlineBackup as No internet service(1526) distribution.
  10. DeviceProtection column contains DeviceProtection as YES(2422), DeviceProtection as NO(3095) and DeviceProtection as No internet service(1526) distribution.
  11. TechSupport column contains TechSupport as YES(2044), TechSupport as NO(3473) and TechSupport as No internet service(1526) distribution.
  12. StreamingTV column contains StreamingTV as YES(2707), StreamingTV as NO(2810) and StreamingTV as No internet service(1526) distribution.
  13. StreamingMovies column contains StreamingMovies as YES(2732), StreamingMovies as NO(2785) and StreamingMovies as No internet service(1526) distribution.
  14. Contract column contains Contract as One year(1473), Contract as Two year(1695) and Contract as Month-to-month(3875) distribution.
  15. PaperlessBilling column contains PaperlessBilling as YES(4171) and PaperlessBilling as NO(2872) distribution.
  16. PaymentMethod column contains PaymentMethod as Electronic check(2365), PaymentMethod as Mailed check(1612), PaymentMethod as Bank transfer (automatic)(1544) and PaymentMethod as Credit card (automatic)(1522) distribution.
  17. Churn column contains Churn as YES(1869) and Churn as NO(5174) distribution.

E. Encode all the appropriate Categorical features with the best suitable approach.¶

Converting data having Yes/Male as 1 and No/Female as 0. Using get_dummies() function, converts categorical data into dummy or indicator variables.

In [18]:
# Duplicating dataset
telecom_cust_churn_dup=telecom_cust_churn.copy(deep=True)
telecom_cust_churn_dup.shape
Out[18]:
(7043, 21)
In [19]:
telecom_cust_churn.drop(columns='customerID', inplace=True, axis=1)
# Encoding gender column
telecom_cust_churn.gender=[1 if gender=='Male' else 0 for gender in telecom_cust_churn.gender]

# Columns with Yes/No values
for col in ['Partner','Dependents','PhoneService','PaperlessBilling','Churn']:
    telecom_cust_churn[col] = [1 if each == "Yes" else 0 for each in telecom_cust_churn[col]]

# Columns with get_dummies() usage

telecom_cust_churn = pd.get_dummies(telecom_cust_churn, columns=['InternetService','Contract', 'PaymentMethod', 'OnlineSecurity','MultipleLines','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies'], drop_first=False, prefix_sep='_')

telecom_cust_churn.head()
Out[19]:
gender SeniorCitizen Partner Dependents tenure PhoneService PaperlessBilling MonthlyCharges TotalCharges Churn ... DeviceProtection_Yes TechSupport_No TechSupport_No internet service TechSupport_Yes StreamingTV_No StreamingTV_No internet service StreamingTV_Yes StreamingMovies_No StreamingMovies_No internet service StreamingMovies_Yes
0 0 0 1 0 1 0 1 29.85 29.85 0 ... 0 1 0 0 1 0 0 1 0 0
1 1 0 0 0 34 1 0 56.95 1889.50 0 ... 1 1 0 0 1 0 0 1 0 0
2 1 0 0 0 2 1 1 53.85 108.15 1 ... 0 1 0 0 1 0 0 1 0 0
3 1 0 0 0 45 0 0 42.30 1840.75 0 ... 1 0 0 1 1 0 0 1 0 0
4 0 0 0 0 2 1 1 70.70 151.65 1 ... 0 1 0 0 1 0 0 1 0 0

5 rows × 41 columns

F. Split the data into 80% train and 20% test.¶

G. Normalize/Standardize the data with the best suitable approach.¶

In [20]:
telecom_cust_churn.describe().T
Out[20]:
count mean std min 25% 50% 75% max
gender 7043.0 0.504756 0.500013 0.00 0.000 1.00 1.00 1.00
SeniorCitizen 7043.0 0.162147 0.368612 0.00 0.000 0.00 0.00 1.00
Partner 7043.0 0.483033 0.499748 0.00 0.000 0.00 1.00 1.00
Dependents 7043.0 0.299588 0.458110 0.00 0.000 0.00 1.00 1.00
tenure 7043.0 32.371149 24.559481 0.00 9.000 29.00 55.00 72.00
PhoneService 7043.0 0.903166 0.295752 0.00 1.000 1.00 1.00 1.00
PaperlessBilling 7043.0 0.592219 0.491457 0.00 0.000 1.00 1.00 1.00
MonthlyCharges 7043.0 64.761692 30.090047 18.25 35.500 70.35 89.85 118.75
TotalCharges 7043.0 2283.300441 2265.000258 18.80 402.225 1400.55 3786.60 8684.80
Churn 7043.0 0.265370 0.441561 0.00 0.000 0.00 1.00 1.00
InternetService_DSL 7043.0 0.343746 0.474991 0.00 0.000 0.00 1.00 1.00
InternetService_Fiber optic 7043.0 0.439585 0.496372 0.00 0.000 0.00 1.00 1.00
InternetService_No 7043.0 0.216669 0.412004 0.00 0.000 0.00 0.00 1.00
Contract_Month-to-month 7043.0 0.550192 0.497510 0.00 0.000 1.00 1.00 1.00
Contract_One year 7043.0 0.209144 0.406726 0.00 0.000 0.00 0.00 1.00
Contract_Two year 7043.0 0.240664 0.427517 0.00 0.000 0.00 0.00 1.00
PaymentMethod_Bank transfer (automatic) 7043.0 0.219225 0.413751 0.00 0.000 0.00 0.00 1.00
PaymentMethod_Credit card (automatic) 7043.0 0.216101 0.411613 0.00 0.000 0.00 0.00 1.00
PaymentMethod_Electronic check 7043.0 0.335794 0.472301 0.00 0.000 0.00 1.00 1.00
PaymentMethod_Mailed check 7043.0 0.228880 0.420141 0.00 0.000 0.00 0.00 1.00
OnlineSecurity_No 7043.0 0.496663 0.500024 0.00 0.000 0.00 1.00 1.00
OnlineSecurity_No internet service 7043.0 0.216669 0.412004 0.00 0.000 0.00 0.00 1.00
OnlineSecurity_Yes 7043.0 0.286668 0.452237 0.00 0.000 0.00 1.00 1.00
MultipleLines_No 7043.0 0.481329 0.499687 0.00 0.000 0.00 1.00 1.00
MultipleLines_No phone service 7043.0 0.096834 0.295752 0.00 0.000 0.00 0.00 1.00
MultipleLines_Yes 7043.0 0.421837 0.493888 0.00 0.000 0.00 1.00 1.00
OnlineBackup_No 7043.0 0.438450 0.496232 0.00 0.000 0.00 1.00 1.00
OnlineBackup_No internet service 7043.0 0.216669 0.412004 0.00 0.000 0.00 0.00 1.00
OnlineBackup_Yes 7043.0 0.344881 0.475363 0.00 0.000 0.00 1.00 1.00
DeviceProtection_No 7043.0 0.439443 0.496355 0.00 0.000 0.00 1.00 1.00
DeviceProtection_No internet service 7043.0 0.216669 0.412004 0.00 0.000 0.00 0.00 1.00
DeviceProtection_Yes 7043.0 0.343888 0.475038 0.00 0.000 0.00 1.00 1.00
TechSupport_No 7043.0 0.493114 0.499988 0.00 0.000 0.00 1.00 1.00
TechSupport_No internet service 7043.0 0.216669 0.412004 0.00 0.000 0.00 0.00 1.00
TechSupport_Yes 7043.0 0.290217 0.453895 0.00 0.000 0.00 1.00 1.00
StreamingTV_No 7043.0 0.398978 0.489723 0.00 0.000 0.00 1.00 1.00
StreamingTV_No internet service 7043.0 0.216669 0.412004 0.00 0.000 0.00 0.00 1.00
StreamingTV_Yes 7043.0 0.384353 0.486477 0.00 0.000 0.00 1.00 1.00
StreamingMovies_No 7043.0 0.395428 0.488977 0.00 0.000 0.00 1.00 1.00
StreamingMovies_No internet service 7043.0 0.216669 0.412004 0.00 0.000 0.00 0.00 1.00
StreamingMovies_Yes 7043.0 0.387903 0.487307 0.00 0.000 0.00 1.00 1.00

Scaling 3 columns i.e., tenure, MonthlyCharges and MonthlyCharges, using z-score.

In [21]:
cols_to_scale = ["MonthlyCharges","TotalCharges","tenure"]
telecom_cust_churn[cols_to_scale]=scaler.fit_transform(telecom_cust_churn[cols_to_scale])
telecom_cust_churn.head()
Out[21]:
gender SeniorCitizen Partner Dependents tenure PhoneService PaperlessBilling MonthlyCharges TotalCharges Churn ... DeviceProtection_Yes TechSupport_No TechSupport_No internet service TechSupport_Yes StreamingTV_No StreamingTV_No internet service StreamingTV_Yes StreamingMovies_No StreamingMovies_No internet service StreamingMovies_Yes
0 0 0 1 0 -1.277445 0 1 -1.160323 -0.994971 0 ... 0 1 0 0 1 0 0 1 0 0
1 1 0 0 0 0.066327 1 0 -0.259629 -0.173876 0 ... 1 1 0 0 1 0 0 1 0 0
2 1 0 0 0 -1.236724 1 1 -0.362660 -0.960399 1 ... 0 1 0 0 1 0 0 1 0 0
3 1 0 0 0 0.514251 0 0 -0.746535 -0.195400 0 ... 1 0 0 1 1 0 0 1 0 0
4 0 0 0 0 -1.236724 1 1 0.197365 -0.941193 1 ... 0 1 0 0 1 0 0 1 0 0

5 rows × 41 columns

In [22]:
telecom_cust_churn.Churn.value_counts()
Out[22]:
0    5174
1    1869
Name: Churn, dtype: int64

Balancing Churn data set

In [23]:
telecom_cust_churn_balanced=telecom_cust_churn[telecom_cust_churn.Churn==0].sample(n=1869, random_state=42).append(telecom_cust_churn[telecom_cust_churn.Churn==1])
telecom_cust_churn_balanced.reset_index(drop=True, inplace=True)
telecom_cust_churn_balanced.Churn.value_counts()
Out[23]:
0    1869
1    1869
Name: Churn, dtype: int64
In [24]:
X_train, X_test, y_train, y_test = train_test_split(telecom_cust_churn_balanced.drop(columns='Churn', axis=1),telecom_cust_churn_balanced.Churn, test_size=0.2, random_state=42)
X_train.shape, X_test.shape
Out[24]:
((2990, 40), (748, 40))

3. Model building and Improvement.¶

A. Train a model using XGBoost. Also print best performing parameters along with train and test performance.¶

In [25]:
model=XGBClassifier(random_state=42)
model.fit(X_train,y_train)

y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

sns.heatmap(confusion_matrix(y_test,y_pred), annot=True, fmt='.2f')

print('Train Accuracy Score:',accuracy_score(y_train,y_train_pred))
print('Test Accuracy Score:',accuracy_score(y_test,y_pred))
print('ROC AUC Score:',roc_auc_score(y_test,y_pred))
print("\nClassification Matrix:\n",classification_report(y_test, y_pred))

model_list= [['XGBClassifier', accuracy_score(y_train, y_train_pred), accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred),f1_score(y_test, y_pred)]]
Train Accuracy Score: 0.9719063545150501
Test Accuracy Score: 0.7459893048128342
ROC AUC Score: 0.7462656684614338

Classification Matrix:
               precision    recall  f1-score   support

           0       0.76      0.73      0.74       379
           1       0.73      0.77      0.75       369

    accuracy                           0.75       748
   macro avg       0.75      0.75      0.75       748
weighted avg       0.75      0.75      0.75       748

In [26]:
pd.DataFrame(model.feature_importances_,columns=['Imp'],index=X_train.columns).sort_values(by='Imp',ascending=False)
Out[26]:
Imp
Contract_Month-to-month 0.439045
InternetService_Fiber optic 0.174799
OnlineSecurity_No 0.030087
Contract_One year 0.026698
StreamingMovies_Yes 0.025702
TechSupport_No 0.023378
Contract_Two year 0.022211
PaymentMethod_Electronic check 0.015796
StreamingTV_Yes 0.014391
tenure 0.013428
StreamingMovies_No 0.013266
PhoneService 0.013258
DeviceProtection_No 0.012397
OnlineBackup_No 0.012224
OnlineBackup_Yes 0.011853
InternetService_DSL 0.010373
PaymentMethod_Mailed check 0.010370
PaymentMethod_Bank transfer (automatic) 0.010278
MultipleLines_No 0.009929
MonthlyCharges 0.009542
InternetService_No 0.009512
SeniorCitizen 0.009414
Dependents 0.009345
TotalCharges 0.009277
PaperlessBilling 0.009059
MultipleLines_Yes 0.008953
PaymentMethod_Credit card (automatic) 0.008905
gender 0.008417
Partner 0.008081
StreamingTV_No 0.006544
OnlineSecurity_Yes 0.004899
DeviceProtection_Yes 0.004841
TechSupport_Yes 0.003724
OnlineBackup_No internet service 0.000000
DeviceProtection_No internet service 0.000000
MultipleLines_No phone service 0.000000
TechSupport_No internet service 0.000000
StreamingTV_No internet service 0.000000
StreamingMovies_No internet service 0.000000
OnlineSecurity_No internet service 0.000000

Contract_Month-to-month feature has the highest importance in model.

There are 7 features that can be removed from our model, as they have 0 importance, these features are:

  • OnlineBackup_No internet service
  • DeviceProtection_No internet service
  • MultipleLines_No phone service
  • TechSupport_No internet service
  • StreamingTV_No internet service
  • StreamingMovies_No internet service
  • OnlineSecurity_No internet service

Removing features which have 0 importance.

In [27]:
X_rand_train, X_rand_test, y_rand_train, y_rand_test = train_test_split(telecom_cust_churn_balanced.drop(columns=['Churn','OnlineBackup_No internet service','DeviceProtection_No internet service','MultipleLines_No phone service','TechSupport_No internet service','StreamingTV_No internet service','StreamingMovies_No internet service','OnlineSecurity_No internet service'], axis=1),telecom_cust_churn_balanced.Churn, test_size=0.2, random_state=42)
X_rand_train.shape, X_rand_test.shape
Out[27]:
((2990, 33), (748, 33))
In [28]:
params = {
 'learning_rate':[0.05,0.10,0.15,0.20],
 'max_depth':[3,4,5,6],
 'min_child_weight':[1,3,5],
 'gamma':[0.1,0.2,0.3,0.4],
 'colsample_bytree':[0.3,0.4,0.5],
 'n_estimators':[20,40,50,60]
}

rs_model=RandomizedSearchCV(model,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5, random_state=42)

rs_model.fit(X_rand_train,y_rand_train)

# print best parameter after tuning
print(rs_model.best_params_)

rs_model.best_estimator_.fit(X_rand_train,y_rand_train)

y_rand_pred = rs_model.best_estimator_.predict(X_rand_test)
y_rand_train_pred = rs_model.best_estimator_.predict(X_rand_train)

sns.heatmap(confusion_matrix(y_rand_test,y_rand_pred), annot=True, fmt='.2f')

print('Train Accuracy Score:',accuracy_score(y_rand_train,y_rand_train_pred))
print('Test Accuracy Score:',accuracy_score(y_rand_test,y_rand_pred))
print('ROC AUC Score:',roc_auc_score(y_rand_test,y_rand_pred))
print("\nClassification Matrix:\n",classification_report(y_rand_test, y_rand_pred))

model_list.append(['XGBClassifier_RandomizedSearchCV', accuracy_score(y_rand_train,y_rand_train_pred), accuracy_score(y_rand_test, y_rand_pred), roc_auc_score(y_rand_test,y_rand_pred), precision_score(y_rand_test,y_rand_pred), recall_score(y_rand_test,y_rand_pred), f1_score(y_rand_test,y_rand_pred)])
{'n_estimators': 50, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.15, 'gamma': 0.1, 'colsample_bytree': 0.5}
Train Accuracy Score: 0.802675585284281
Test Accuracy Score: 0.7807486631016043
ROC AUC Score: 0.7809954880551444

Classification Matrix:
               precision    recall  f1-score   support

           0       0.80      0.76      0.78       379
           1       0.77      0.80      0.78       369

    accuracy                           0.78       748
   macro avg       0.78      0.78      0.78       748
weighted avg       0.78      0.78      0.78       748

In [29]:
pd.DataFrame(model_list,columns=['Model','Train_Accuracy','Test_Accuracy','ROC_AUC','Precision','Recall','F1 Score']).sort_values(by=['Recall','F1 Score'], ascending=False)
Out[29]:
Model Train_Accuracy Test_Accuracy ROC_AUC Precision Recall F1 Score
1 XGBClassifier_RandomizedSearchCV 0.802676 0.780749 0.780995 0.766234 0.799458 0.782493
0 XGBClassifier 0.971906 0.745989 0.746266 0.731266 0.766938 0.748677
In [30]:
pd.DataFrame(rs_model.best_estimator_.feature_importances_,columns=['Imp'],index=X_rand_train.columns).sort_values(by='Imp',ascending=False)
Out[30]:
Imp
Contract_Month-to-month 0.387774
OnlineSecurity_No 0.076351
PaymentMethod_Electronic check 0.074828
Contract_Two year 0.059022
TechSupport_No 0.045519
OnlineSecurity_Yes 0.035582
InternetService_No 0.031923
InternetService_Fiber optic 0.031788
tenure 0.023499
Contract_One year 0.021907
StreamingMovies_Yes 0.020114
TotalCharges 0.016778
PaperlessBilling 0.016757
InternetService_DSL 0.014821
PaymentMethod_Bank transfer (automatic) 0.014153
MonthlyCharges 0.013401
StreamingTV_Yes 0.011435
MultipleLines_No 0.010613
DeviceProtection_No 0.010299
DeviceProtection_Yes 0.010281
MultipleLines_Yes 0.008929
TechSupport_Yes 0.008168
OnlineBackup_No 0.008108
PaymentMethod_Credit card (automatic) 0.007684
SeniorCitizen 0.007103
Dependents 0.007018
gender 0.005976
StreamingTV_No 0.005315
PaymentMethod_Mailed check 0.005204
PhoneService 0.004909
OnlineBackup_Yes 0.004741
Partner 0.000000
StreamingMovies_No 0.000000

TechSupport_No feature has the highest importance in model. There are 2 features that can be removed from our model, as they have 0 importance, these features are:

  • OnlineSecurity_Yes
  • PhoneService

After tuning some parameters using RandomizedSearchCV, accuracy has been hiked. Hyperparameters that are used {'n_estimators': 40, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.4, 'colsample_bytree': 0.4}. Train_Accuracy=80%, Test_Accuracy=78%

B. Improve performance of the XGBoost as much as possible. Also print best performing parameters along with train and test performance.¶

In [31]:
X_perf_train, X_perf_test, y_perf_train, y_perf_test = train_test_split(telecom_cust_churn_balanced.drop(columns=['Churn','OnlineBackup_No internet service','DeviceProtection_No internet service','MultipleLines_No phone service','TechSupport_No internet service','StreamingTV_No internet service','StreamingMovies_No internet service','OnlineSecurity_No internet service','OnlineSecurity_Yes','PhoneService'], axis=1),telecom_cust_churn_balanced.Churn, test_size=0.2, random_state=42)
X_perf_train.shape, X_perf_test.shape
Out[31]:
((2990, 31), (748, 31))
In [32]:
# defining parameter range
param_grid = {
 'learning_rate':[0.05,0.10,0.15,0.20],
 'max_depth':[3,4,5,6],
 'min_child_weight':[1,3,5],
 'gamma':[0.1,0.2,0.3,0.4],
 'colsample_bytree':[0.3,0.4,0.5],
 'n_estimators':[20,40,50,60]
}

grid = GridSearchCV(XGBClassifier(random_state=42), param_grid,scoring='roc_auc',n_jobs=-1,cv=5,refit=True)

# fitting the model for grid search
grid.fit(X_perf_train,y_perf_train)

# print best parameter after tuning
print(grid.best_params_)
{'colsample_bytree': 0.3, 'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 50}
In [33]:
grid.best_estimator_.fit(X_perf_train,y_perf_train)

y_perf_pred = grid.best_estimator_.predict(X_perf_test)
y_perf_train_pred = grid.best_estimator_.predict(X_perf_train)

sns.heatmap(confusion_matrix(y_test,y_perf_pred), annot=True, fmt='.2f')

print('Train Accuracy Score:',accuracy_score(y_perf_train,y_perf_train_pred))
print('Test Accuracy Score:',accuracy_score(y_perf_test,y_perf_pred))
print('ROC AUC Score:',roc_auc_score(y_perf_test,y_perf_pred))
print("\nClassification Matrix:\n",classification_report(y_perf_test, y_perf_pred))

model_list.append(['XGBClassifier_GridSearchCV', accuracy_score(y_perf_train,y_perf_train_pred), accuracy_score(y_perf_test, y_perf_pred), roc_auc_score(y_perf_test,y_perf_pred), precision_score(y_perf_test,y_perf_pred), recall_score(y_perf_test,y_perf_pred), f1_score(y_perf_test,y_perf_pred)])
Train Accuracy Score: 0.8096989966555184
Test Accuracy Score: 0.7740641711229946
ROC AUC Score: 0.7743276773137124

Classification Matrix:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77       379
           1       0.76      0.79      0.78       369

    accuracy                           0.77       748
   macro avg       0.77      0.77      0.77       748
weighted avg       0.77      0.77      0.77       748

In [34]:
pd.DataFrame(model_list,columns=['Model','Train_Accuracy','Test_Accuracy','ROC_AUC','Precision','Recall','F1 Score']).sort_values(by=['Recall','F1 Score'], ascending=False)
Out[34]:
Model Train_Accuracy Test_Accuracy ROC_AUC Precision Recall F1 Score
1 XGBClassifier_RandomizedSearchCV 0.802676 0.780749 0.780995 0.766234 0.799458 0.782493
2 XGBClassifier_GridSearchCV 0.809699 0.774064 0.774328 0.759067 0.794038 0.776159
0 XGBClassifier 0.971906 0.745989 0.746266 0.731266 0.766938 0.748677

Different performance activities and params has been tuned, which has increased accuracy with recall and F1 score

#¶

PART-B¶

#¶

1. Build a simple ML workflow which will accept a single ‘.csv’ file as input and return a trained base model that can be used for predictions. You can use 1 Dataset from Part 1 (single/merged).¶

2. Create separate functions for various purposes.¶

3. Various base models should be trained to select the best performing model.¶

4. Pickle file should be saved for the best performing model.¶

Include best coding practices in the code:¶

  • Modularization
  • Maintainability
  • Well commented code etc.
In [35]:
def load_read_dataset(filename, names=None, delimiter=',', usecol=None):
    """
    load_read_dataset function reads file and load into dataframe for further analysis.

    :param filename: take name of file as input, ex: 'credit.csv'
    :param names: if file didn't contain any column names or if user want to provide different columns, ex: ['purpose','amount','age','default']
    :param delimiter: by default .csv file has comma(,) as separator, but user can provide different separator also, ex: ','
    :param usecol: param you can select columns to load from the CSV file, ex: ['age','default']

    :return: dataframe object is returned.
    """

    print("Reading dataset filename:{filename}, names:{names}, delimiter:{delimiter}, usecol:{usecol}".format(filename=filename, names=names, delimiter=delimiter, usecol=usecol))

    # Reading .csv file with custom inputs
    df1 = pd.read_csv(filename, sep=delimiter, names=names, usecols=usecol)

    # Printing total number of entries with features
    print(f"Dataset have {df1.shape[0]} entries with {df1.shape[1]} features.\n")

    # print dataframe information
    print('Dataframe Information:')
    print(df1.dtypes)

    # checking random 5 data from dataset
    print('\n5 Sample set from dataframe:')
    print(df1.sample(5))

    print('Dataset loaded successfully.')
    return df1
In [36]:
def check_duplicate(df1):
    """
    check_duplicate function eliminates duplicate from dataframe if present.

    :param df1: Pandas dataframe object

    :return: dataframe object is returned.
    """

    print('Duplicate check analysis started.')

    df_dup = df1[df1.duplicated()]

    if df_dup.shape[0]!=0:
        print('\nDuplicate founded:\n')
        print(df_dup)
        print('\nRemoving duplicates\n')

        df1.drop_duplicates(inplace=True)

    print('Duplicate check analysis completed.')
    return df1

def empty_data_check(df1):
    """
    empty_data_check function check for null data values. This function also provides user to define structure for substitution of values.

    :param df1: Pandas dataframe object

    :return: dataframe object is returned.
    """

    print('Empty data check analysis started.')

    column_names=df1.columns

    for col_name in column_names:
        if is_numeric_dtype(df1[col_name]) and df1[col_name].isnull().sum()>0:
            df1[col_name].fillna(df1[col_name].median())

        if is_string_dtype(df1[col_name]):
            val=df1[col_name].mode()[0]
            df1[col_name].fillna(val)
            df1[col_name]=df1[col_name].replace('', val)

    print('Empty data check analysis completed.')
    return df1

def object_numeric_type_conversion(df1, object_numeric_names):
    """
    object_numeric_type_conversion function convert object dtype to numeric dtype.

    :param df1: Pandas dataframe object
    :param object_numeric_names: all column names that need to be converted to numeric dtype

    :return: dataframe object is returned.
    """

    print('Object to numeric conversion started')

    def check(val):
        if not(val and val.strip()):
            return np.nan
        return ''.join(re.findall(r'\d+', val))

    for col_name in object_numeric_names:
        df1[col_name]=df1[col_name].apply(check)
        df1[col_name]=pd.to_numeric(df1[col_name], errors='coerce')
        df1[col_name].fillna(df1[col_name].mean(),inplace = True)

    print('Object to numeric conversion completed')
    return df1

def outlier_check(df1):
    """
    outlier_check function check outliers if present, then substitute with the IQR values.

    :param df1: Pandas dataframe object

    :return: dataframe object is returned.
    """

    print('Outlier check analysis started.')

    column_names=df1.columns

    for col_name in column_names:
        if is_numeric_dtype(df1[col_name]):
            q1=np.quantile(df1[col_name], 0.25)
            q3=np.quantile(df1[col_name], 0.75)
            cut_off=1.5*(q3-q1)
            right_whisker= q3 + cut_off
            left_whiskers=q1 - cut_off

            #Replace every outlier on the upper side by the upper whisker
            for i in np.where(df1[col_name] > right_whisker)[0]:
                df1.loc[i,col_name] = right_whisker

            #Replace every outlier on the lower side by the lower whisker
            for i in np.where(df1[col_name] < left_whiskers)[0]:
                df1.loc[i,col_name] = left_whiskers

    print('Outlier check analysis completed')
    return df1

def pre_processing(df1, object_numeric_names):
    """
    pre_processing function checks for duplicates, empty data and outliers.

    :param df1: Pandas dataframe object
    :param object_numeric_names: all column names that need to be converted to numeric dtype

    :return: dataframe object is returned.
    """

    print('Pre-processing analysis started')

    df1=check_duplicate(df1)
    df1=empty_data_check(df1)
    df1=object_numeric_type_conversion(df1, object_numeric_names)
    df1=outlier_check(df1)

    print('Pre-processing analysis completed')
    return df1
In [37]:
def visualization(df1, pie_col_names):
    """
    visualization function represents 3 variation of graph i.e. pie chart, pair-plot and heat map.

    :param df1: Pandas dataframe object
    :param pie_col_names: names of column that need to be represented in pie chart
    """

    print('Data Visualization with different features.')

    print('Pie Chart Visualization')
    for pie_col_name in pie_col_names:
        pie_graph=px.pie(df1, hole=0.3, values=df1[pie_col_name].value_counts(), names=df1[pie_col_name].value_counts().index, color_discrete_sequence=px.colors.sequential.turbid_r, title=pie_col_name+' Column Representation')
        pie_graph.show()

    print('Pair-Plot Visualization')
    sns.pairplot(df1)
    plt.show()

    print('HeatMap Visualization')
    sns.heatmap(df1.corr(), annot=True, fmt='.2f')
    plt.show()
In [38]:
def data_conversion(df1, replace_struct, cat_numeric_name, one_hot_encoder_name, standardization_col_name):
    """
    data_conversion function convert data as per the need.

    :param df1: Pandas dataframe object
    :param replace_struct: user defined structure to update values in dataset, ex: {
                "checking_balance":{"< 0 DM":1,"1-200 DM":2,"> 200 DM":3},"job":{"unemployed":1,"unskilled":2,"skilled":3} }
    :param cat_numeric_name: names of column that needed to be converted from category type to numeric type
    :param one_hot_encoder_name: names of column that convert categorical variable into dummy/indicator variables.
    :param standardization_col_name: names of column that standardize features by removing the mean and scaling to unit variance.

    :return: dataframe object is returned.
    """

    print('Data conversion on the basis of LabelEncoder or OneHotEncoder or user defined structure')

    if cat_numeric_name:
        for col_name in cat_numeric_name:
            df1[col_name]=label_encoder.fit_transform(df1[col_name])

    if replace_struct:
        df1=df1.replace(replace_struct)

    if one_hot_encoder_name:
        df1=pd.get_dummies(df1, columns=one_hot_encoder_name)

    if standardization_col_name:
        df1[standardization_col_name]=scaler.fit_transform(df1[standardization_col_name])

    print('Data conversion completed.')
    return df1
In [39]:
def features_removal(df1, feature_names):
    """
    features_removal function removes features that aren't needed for model training.

    :param df1: Pandas dataframe object
    :param feature_names: names of column that need to be removed.

    :return: dataframe object is returned.
    """

    print('Eliminating unwanted features for model training')

    df1.drop(columns=feature_names, axis=1, inplace=True)
In [40]:
def train_model(df1, is_regression, target_value, test_size_percent):
    """
    train_model function removes features that aren't needed for model training.

    :param df1: Pandas dataframe object
    :param is_regression: model to be trained as regressor or classifier
    :param target_value: target output
    :param test_size_percent: the proportion of the dataset to include in the test split.

    :return list of trained models with metrics
    """

    model_details,param_details,train_model_lists={},{},[]

    X,y=sm.fit_resample(df1.drop(target_value, axis=1), df1[target_value])

    X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=test_size_percent/100, random_state=42)

    if is_regression:
        model_details={
            'Linear_Regression': LinearRegression(),
            'Ridge_Regression': Ridge(random_state=42),
            'Lasso_Regression': Lasso(random_state=42),
            'KNN_Regression': KNeighborsRegressor(),
            'SVM_Regression': SVR(),
            'DecisionTree_Regression': DecisionTreeRegressor(random_state=42),
            'RandomForest_Regression': RandomForestRegressor(random_state=42),
            'GradientBoosting_Regression': GradientBoostingRegressor(random_state=42),
            'AdaBoost_Regression': AdaBoostRegressor(random_state=42),
            'XGB_Regression': XGBRegressor(random_state=42)
        }
        param_details={
            'Linear_Regression': {},
            'Ridge_Regression': {'alpha': np.logspace(-8, 8, 100), 'fit_intercept': [True, False], 'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']},
            'Lasso_Regression': {'alpha': np.logspace(-8, 8, 100)},
            'KNN_Regression':{'n_neighbors': np.arange(1,int(np.sqrt(len(X_train)))).tolist(),'p': [1,2],'weights': ['uniform','distance']},
            'SVM_Regression': {'kernel':['linear','rbf'],'C':[0.1,1,10,100],'gamma':[1,0.1,0.001]},
            'DecisionTree_Regression': {'criterion':['squared_error','friedman_mse','absolute_error','poisson'],'min_samples_split':[10,20,40],'max_depth':[2,6,8],'min_samples_leaf':[20,40,100],'max_leaf_nodes':[5,20,100]},
            'RandomForest_Regression': {'max_depth': [5,10,None], 'max_features': ['sqrt','log2',None], 'n_estimators': [5,6,7,8,9,10,11,12,13,15]},
            'GradientBoosting_Regression': {'learning_rate': [0.05,0.10,0.15],'min_samples_split': np.linspace(0.1,0.5,6),'min_samples_leaf': np.linspace(0.1,0.5,6),'max_depth':[3,5,8],'max_features':['log2','sqrt'],'subsample':[0.5,1.0],'n_estimators':[20,40,50,60]},
            'AdaBoost_Regression': {'n_estimators': [20,40,50,60],'learning_rate': [0.05,0.10,0.15]},
            'XGB_Regression': {'learning_rate':[0.05,0.10,0.15],'max_depth':[3,4,5,6],'min_child_weight':[1,3,5],'gamma':[0.1,0.2,0.3],'colsample_bytree':[0.3,0.4,0.5],'n_estimators':[20,40,50,60]}
        }
    else:
        model_details={
            'KNN_Classifier': KNeighborsClassifier(),
            'SVM_Classifier': SVC(random_state=42),
            'XGB_Classifier': XGBClassifier(random_state=42),
            'GaussianNB': GaussianNB(),
            'GradientBoosting_Classifier': GradientBoostingClassifier(random_state=42),
            'AdaBoost_Classifier': AdaBoostClassifier(random_state=42),
            'RandomForest_Classifier': RandomForestClassifier(random_state=42),
            'DecisionTree_Classifier': DecisionTreeClassifier(random_state=42),
        }
        param_details={
            'KNN_Classifier': {'n_neighbors': np.arange(1,int(np.sqrt(len(X_train)))).tolist(), 'p': [1,2]},
            'SVM_Classifier': {'C':[0.1,1,10,100],'gamma':[1,0.1,0.001], 'kernel':['linear','rbf']},
            'XGB_Classifier': {'learning_rate':[0.05,0.10,0.15],'max_depth':[3,4,5,6],'min_child_weight':[1,3,5],'gamma':[0.1,0.2,0.3],'colsample_bytree':[0.3,0.4,0.5],'n_estimators':[20,40,50,60]},
            'GaussianNB': {'var_smoothing': np.logspace(0,-9, num=100)},
            'GradientBoosting_Classifier': {'learning_rate': [0.05,0.10,0.15],'min_samples_split': np.linspace(0.1,0.5,6),'min_samples_leaf': np.linspace(0.1,0.5,6),'max_depth':[3,5,8],'max_features':['log2','sqrt'],'subsample':[0.5,1.0],'n_estimators':[20,40,50,60]},
            'AdaBoost_Classifier': {'n_estimators': [20,40,50,60],'learning_rate': [0.05,0.10,0.15]},
            'RandomForest_Classifier': {'n_estimators': [20,40,50,60],'max_depth' : [4,5,6,7,8],'criterion' :['gini','entropy']},
            'DecisionTree_Classifier': {'ccp_alpha': [0.1,.01,.001],'max_depth': [5, 6, 7, 8, 9],'criterion' :['gini', 'entropy']},
        }

    for key in model_details.keys():
        print("Running GridSearchCV for %s." % key)

        grid_search = GridSearchCV(model_details.get(key), param_details.get(key), cv=3, n_jobs=3, refit=True)
        grid_search.fit(X_train,y_train)

        predicted_model=grid_search.best_estimator_

        predicted_model.fit(X_train,y_train)

        y_pred = predicted_model.predict(X_test)
        y_train_pred = predicted_model.predict(X_train)

        if is_regression:
            train_model_lists.append([key, predicted_model.score(X_train,y_train), predicted_model.score(X_test,y_test),
                mean_absolute_error(y_test,y_pred), mean_squared_error(y_test,y_pred), np.sqrt(mean_squared_error(y_test,y_pred)), r2_score(y_test, y_pred), predicted_model])
        else:
            train_model_lists.append([key, accuracy_score(y_train,y_train_pred), accuracy_score(y_test, y_pred),
                roc_auc_score(y_test,y_pred), precision_score(y_test,y_pred), recall_score(y_test,y_pred), f1_score(y_test,y_pred), predicted_model])

        print("GridSearchCV for %s completed." % key)

    return train_model_lists
In [41]:
def predict_best_model(train_model_lists, is_regression):
    """
    predict_best_model function compares differnet models over metrics and return best fit model.

    :param train_model_lists: list of trained models with metrics
    :param is_regression: model to be trained as regressor or classifier

    :return best predicted model object
    """

    print('Different models with metrics')
    model_df=None
    if is_regression:
        model_df=pd.DataFrame(train_model_lists,columns=['Model_Name','Train_Accuracy','Test_Accuracy','MAE','MSE','RMSE','R2_Score', 'Model']).sort_values(by=['R2_Score'], ascending=False)
    else:
        model_df=pd.DataFrame(train_model_lists,columns=['Model_Name','Train_Accuracy','Test_Accuracy','ROC_AUC','Precision','Recall','F1 Score', 'Model']).sort_values(by=['Recall','F1 Score'], ascending=False)

    print(model_df)

    return model_df.head(1).Model.values[0]
In [42]:
def process(filename, object_numeric_names, pie_col_names, replace_struct, cat_numeric_name, one_hot_encoder_name, standardization_col_name, feature_names, is_regression, target_value, test_size_percent, names=None, delimiter=',', usecol=None):
    """
    process function contains differnet types of sub process which at end provides best fit model.

    :param filename: take name of file as input, ex: 'credit.csv'
    :param object_numeric_names: all column names that need to be converted to numeric dtype
    :param pie_col_names: names of column that need to be represented in pie chart
    :param replace_struct: user defined structure to update values in dataset, ex: {
                "checking_balance":{"< 0 DM":1,"1-200 DM":2,"> 200 DM":3},"job":{"unemployed":1,"unskilled":2,"skilled":3} }
    :param cat_numeric_name: names of column that needed to be converted from category type to numeric type
    :param one_hot_encoder_name: names of column that convert categorical variable into dummy/indicator variables.
    :param standardization_col_name: names of column that standardize features by removing the mean and scaling to unit variance.
    :param feature_names: names of column that need to be removed.
    :param is_regression: model to be trained as regressor or classifier
    :param target_value: target output
    :param test_size_percent: the proportion of the dataset to include in the test split.
    :param names: if file didn't contain any column names or if user want to provide different columns, ex: ['purpose','amount','age','default']
    :param delimiter: by default .csv file has comma(,) as separator, but user can provide different separator also, ex: ','
    :param usecol: param you can select columns to load from the CSV file, ex: ['age','default']

    :return best predicted model object
    """

    df1=load_read_dataset(filename, names, delimiter, usecol)

    df1=pre_processing(df1, object_numeric_names)

    visualization(df1, pie_col_names)

    df1=data_conversion(df1, replace_struct, cat_numeric_name, one_hot_encoder_name, standardization_col_name)

    features_removal(df1, feature_names)

    train_model_lists=train_model(df1, is_regression, target_value, test_size_percent)

    return predict_best_model(train_model_lists, is_regression)
In [43]:
# Classification
model=process('TelcomCustomer-Churn_2.csv',['TotalCharges'],['OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod'],None,['PaperlessBilling','Churn'],['OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaymentMethod'],['MonthlyCharges','TotalCharges'],['customerID'],False,'Churn',20)
print(model)

# create an iterator object with write permission - model.pkl
with open('model_pkl', 'wb') as files:
    pickle.dump(model, files)
Reading dataset filename:TelcomCustomer-Churn_2.csv, names:None, delimiter:,, usecol:None
Dataset have 7043 entries with 12 features.

Dataframe Information:
customerID           object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

5 Sample set from dataframe:
      customerID         OnlineBackup     DeviceProtection  \
3422  1837-YQUCE                   No                  Yes   
5362  0485-ZBSLN  No internet service  No internet service   
6821  8784-CGILN                   No                   No   
3188  0587-DMGBH                   No                   No   
2135  4010-YLMVT                  Yes                   No   

              TechSupport          StreamingTV      StreamingMovies  \
3422                  Yes                  Yes                  Yes   
5362  No internet service  No internet service  No internet service   
6821                  Yes                  Yes                  Yes   
3188                   No                   No                   No   
2135                   No                  Yes                  Yes   

            Contract PaperlessBilling              PaymentMethod  \
3422        Two year              Yes  Bank transfer (automatic)   
5362        Two year               No    Credit card (automatic)   
6821  Month-to-month              Yes  Bank transfer (automatic)   
3188  Month-to-month              Yes           Electronic check   
2135  Month-to-month              Yes    Credit card (automatic)   

      MonthlyCharges TotalCharges Churn  
3422           58.35      4214.25    No  
5362           24.75       1715.1    No  
6821           99.85      1776.95   Yes  
3188           49.85       365.55   Yes  
2135          106.60      5893.95    No  
Dataset loaded successfully.
Pre-processing analysis started
Duplicate check analysis started.
Duplicate check analysis completed.
Empty data check analysis started.
Empty data check analysis completed.
Object to numeric conversion started
Object to numeric conversion completed
Outlier check analysis started.
Outlier check analysis completed
Pre-processing analysis completed
Data Visualization with different features.
Pie Chart Visualization
Pair-Plot Visualization
HeatMap Visualization
Data conversion on the basis of LabelEncoder or OneHotEncoder or user defined structure
Data conversion completed.
Eliminating unwanted features for model training
Running GridSearchCV for KNN_Classifier.
GridSearchCV for KNN_Classifier completed.
Running GridSearchCV for SVM_Classifier.
GridSearchCV for SVM_Classifier completed.
Running GridSearchCV for XGB_Classifier.
GridSearchCV for XGB_Classifier completed.
Running GridSearchCV for GaussianNB.
GridSearchCV for GaussianNB completed.
Running GridSearchCV for GradientBoosting_Classifier.
GridSearchCV for GradientBoosting_Classifier completed.
Running GridSearchCV for AdaBoost_Classifier.
GridSearchCV for AdaBoost_Classifier completed.
Running GridSearchCV for RandomForest_Classifier.
GridSearchCV for RandomForest_Classifier completed.
Running GridSearchCV for DecisionTree_Classifier.
GridSearchCV for DecisionTree_Classifier completed.
Different models with metrics
                    Model_Name  Train_Accuracy  Test_Accuracy   ROC_AUC  \
3                   GaussianNB        0.749577       0.755072  0.753518   
1               SVM_Classifier        0.839937       0.813043  0.812375   
6      RandomForest_Classifier        0.811186       0.795652  0.794759   
2               XGB_Classifier        0.848273       0.807729  0.807028   
0               KNN_Classifier        0.996497       0.812560  0.812108   
5          AdaBoost_Classifier        0.752839       0.758454  0.757482   
4  GradientBoosting_Classifier        0.766973       0.780676  0.780048   
7      DecisionTree_Classifier        0.779778       0.779710  0.779395   

   Precision    Recall  F1 Score  \
3   0.711719  0.868446  0.782310   
1   0.788831  0.861773  0.823690   
6   0.765254  0.860820  0.810229   
2   0.782798  0.858913  0.819091   
0   0.796945  0.845567  0.820537   
5   0.730479  0.829361  0.776786   
4   0.761194  0.826501  0.792505   
7   0.771769  0.802669  0.786916   

                                               Model  
3    GaussianNB(var_smoothing=0.0012328467394420659)  
1             SVC(C=100, gamma=0.1, random_state=42)  
6  (DecisionTreeClassifier(max_depth=8, max_featu...  
2  XGBClassifier(base_score=0.5, booster='gbtree'...  
0                KNeighborsClassifier(n_neighbors=1)  
5  (DecisionTreeClassifier(max_depth=1, random_st...  
4  ([DecisionTreeRegressor(criterion='friedman_ms...  
7  DecisionTreeClassifier(ccp_alpha=0.001, max_de...  
GaussianNB(var_smoothing=0.0012328467394420659)
In [44]:
with open('model_pkl' , 'rb') as f:
    pickle_model = pickle.load(f)
print(pickle_model)
GaussianNB(var_smoothing=0.0012328467394420659)
In [ ]: